started: 01Mar2016
last updated: 19Jan2017
The filters are applied in the following order:
gq > 20
dp < 500
call_rate > 0.8
gq 20 filter is set arbitrary; however, it is consistent with what is done by others (e.g. see Carson BMC Bioinformatics. 2014 15:125).
A small number of genotypes (<<1%) was covered too high to be true (up to 1-2k coverage). These are obvious mistakes, and they have been removed too. Arbitrarily the threshold for max DP was set to 500 (appr. 10 fold of average coverage).
It was discussed with DC whether to filter cases by call rate per case. There was ~3 cases with low coverage (<20) and low call rates (<50%). We desided to keep such cases because their retained genotypes still passed all filters.
# Time stamp
Sys.time()
[1] "2017-01-19 20:53:24 GMT"
# Folders
setwd("/scratch/medgen/scripts/wecare_stat_01.17/scripts")
interim_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data"
# Thresholds for genotypes
min.gq <- 20
max.dp <- 500
# Variants call ratethreshold
min.call.rate <- 0.8
load(paste(interim_data_folder, "r01_read_and_clean_data_jan2017.RData", sep="/"))
dim(gt.mx)
[1] 343824 710
class(gt.mx)
[1] "matrix"
gt.mx[1:5,1:5]
HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001 NA 0 0 NA NA
Var000000002 0 0 NA NA 0
Var000000003 0 0 0 0 0
Var000000004 0 0 0 0 0
Var000000005 0 NA 0 0 0
dim(gq.mx)
[1] 343824 710
class(gq.mx)
[1] "matrix"
gq.mx[1:5,1:5]
HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001 NA 9 3 NA NA
Var000000002 99 99 NA NA 58
Var000000003 36 99 21 12 21
Var000000004 36 99 18 3 21
Var000000005 36 NA 9 99 33
dim(dp.mx)
[1] 343824 710
class(dp.mx)
[1] "matrix"
dp.mx[1:5,1:5]
HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001 0 4 1 0 NA
Var000000002 74 169 0 0 130
Var000000003 13 35 7 4 8
Var000000004 13 35 7 1 8
Var000000005 28 NA 10 37 20
dim(covar.df)
[1] 498 34
str(covar.df)
'data.frame': 498 obs. of 34 variables:
$ Subject_ID : int 200054 200491 200565 200698 200958 201046 201558 201921 202026 202236 ...
$ setno : int 382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
$ cc : int 0 0 0 0 0 0 1 1 0 0 ...
$ chemo : int 1 1 0 0 1 1 1 1 1 1 ...
$ hormone : int 0 1 1 0 1 0 0 1 1 0 ...
$ chemo_hormone : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
$ chemo_self_mra : int 1 1 0 0 1 1 1 1 1 1 ...
$ hormone_self_mra: int 0 1 1 0 1 0 0 1 1 0 ...
$ treatment : int 1 1 1 0 1 1 1 1 1 1 ...
$ ID : int 2 6 7 9 11 12 16 22 24 26 ...
$ labid : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
$ status : int 0 0 0 0 0 0 1 1 0 0 ...
$ status2 : int 0 0 0 0 0 0 1 1 0 0 ...
$ offset : num 6.41 5.82 5.61 4.03 4.77 ...
$ sub_dx_age : int 46 50 46 44 43 39 45 40 43 36 ...
$ XRTBreast : int 0 0 0 1 0 1 0 0 1 0 ...
$ Eigen_1 : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
$ Eigen_2 : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
$ Eigen_3 : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
$ Eigen_4 : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
$ Eigen_5 : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
$ dose : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
$ dsmiss : int 0 0 0 0 1 0 0 0 0 0 ...
$ good_location : int 1 1 0 1 1 1 0 1 0 0 ...
$ Deleterious : int 0 0 0 0 0 0 0 0 0 0 ...
$ registry : Factor w/ 5 levels "IA","IR","LA",..: 5 2 2 4 4 3 2 1 1 5 ...
$ race : int 0 0 0 0 0 0 0 0 0 0 ...
$ age_stratum1 : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
$ dxyr_stratum : int 2 2 3 1 1 2 1 2 3 2 ...
$ CMF_Only : int 1 0 0 0 0 1 0 0 1 1 ...
$ family_history : int 0 0 0 0 0 0 1 0 0 0 ...
$ sub_dx_age_cat : int 0 0 0 1 1 1 0 1 1 1 ...
$ CMF : Factor w/ 3 levels "CMF","Oth","no": 1 2 3 3 2 1 2 2 1 1 ...
$ XRTBrCHAR : int 0 0 0 1 0 1 0 0 1 0 ...
covar.df[1:5,1:5]
dim(samples.df)
[1] 512 4
str(samples.df)
'data.frame': 512 obs. of 4 variables:
$ wes_id : Factor w/ 512 levels "P1_A01","P1_A02",..: 1 2 3 4 5 6 7 8 9 10 ...
$ gwas_id : Factor w/ 510 levels "id200054","id200491",..: 14 405 315 264 67 121 326 251 281 141 ...
$ merged_id: Factor w/ 512 levels "P1_A01_id203568",..: 1 2 3 4 5 6 7 8 9 10 ...
$ filter : Factor w/ 5 levels "duplicate","low_concordance",..: 5 5 5 5 5 5 5 5 5 5 ...
samples.df[1:5,]
dim(demographics.df)
[1] 505 91
str(demographics.df)
'data.frame': 505 obs. of 91 variables:
$ Subject_ID : Factor w/ 503 levels "200054","200491",..: 1 2 3 4 5 6 7 8 9 10 ...
$ ID.x : num 2 6 7 9 11 12 NA NA 24 26 ...
$ labid.x : Factor w/ 498 levels "15582015","19212019",..: 6 7 8 9 10 11 NA NA 12 13 ...
$ Eigen_1.x : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
$ Eigen_2.x : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
$ Eigen_3.x : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
$ Eigen_4.x : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
$ Eigen_5.x : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
$ Phase : num 1 1 1 1 1 1 NA NA 1 1 ...
$ setno.x : num 382125 204356 360683 226881 357431 ...
$ cc.x : int 0 0 0 0 0 0 NA NA 0 0 ...
$ rstime : num 7.42 9.75 6.09 6.25 9.34 7 NA NA 6.09 8.66 ...
$ registry.x : Factor w/ 7 levels "","IA","IR","LA",..: 6 3 3 5 5 4 NA NA 2 6 ...
$ race.x : num 0 0 0 0 0 0 NA NA 0 0 ...
$ offset.x : num 6.41 5.82 5.61 4.03 4.77 ...
$ DOB : num -5049 -7459 -3916 -5890 -6407 ...
$ sub_dx_age.x : num 46 50 46 44 43 39 NA NA 43 36 ...
$ refage : num 53 59 51 50 52 45 NA NA 48 44 ...
$ BMI_age18 : num 20.2 19.7 23.3 19.5 25.8 ...
$ BMI_dx : num 22.8 20.9 23.3 32.6 25.8 ...
$ BMI_ref : num 25.7 22 25.8 32.6 28.1 ...
$ hormone_self_mra.x : num 0 1 1 0 1 0 NA NA 1 0 ...
$ chemo_self_mra.x : num 1 1 0 0 1 1 NA NA 1 1 ...
$ treatment.x : num 1 1 1 0 1 1 NA NA 1 1 ...
$ family_history.x : Factor w/ 3 levels "1+","none","othe": 2 2 2 2 2 2 NA NA 2 2 ...
$ rh_age_menarche : num 13 14 13 13 12 9 NA NA 12 11 ...
$ age_menopause_1yrbf_fd: num -1 48 -1 -1 -1 -1 NA NA -1 -1 ...
$ age_1fftp_fd : num 20 22 0 29 28 0 NA NA 27 24 ...
$ Num_ftp_fd : num 1 2 -1 2 2 -1 NA NA 3 2 ...
$ Histology : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
$ Histology1 : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
$ Hist_lob_other : Factor w/ 4 levels "lobular","other",..: 2 2 2 2 2 2 NA NA 2 2 ...
$ stage_fd : num 2 1 1 1 2 2 NA NA 2 2 ...
$ er_fd : num 1 1 1 4 1 2 NA NA 2 2 ...
$ pr_fd : num 1 1 1 2 1 2 NA NA 1 0 ...
$ histo1_cat : Factor w/ 9 levels "Tubular/mucinous",..: 9 4 4 6 4 4 NA NA 4 4 ...
$ er1_cat : Factor w/ 5 levels "negative","own unkn",..: 3 3 3 5 3 1 NA NA 1 1 ...
$ pr1_cat : Factor w/ 7 levels "negative","own 0 Ot",..: 3 3 3 1 3 1 NA NA 3 7 ...
$ status.x : num 0 0 0 0 0 0 NA NA 0 0 ...
$ status2.x : Factor w/ 4 levels "","0","1","h": 2 2 2 2 2 2 NA NA 2 2 ...
$ sub_race : num 0 0 0 0 0 0 NA NA 0 0 ...
$ er1_num : num 1 1 1 NA 1 0 NA NA 0 0 ...
$ er1 : int 1 1 1 NA 1 0 NA NA 0 0 ...
$ horm_tmx : num 0 1 1 0 2 0 NA NA 1 0 ...
$ XRTBreast.x : num 0 0 0 1 0 1 NA NA 1 0 ...
$ dose_caseloc : num 0 0 0 0.96 NA 0.88 NA NA 0.76 0 ...
$ good_location.x : num 1 1 0 1 1 1 NA NA 0 0 ...
$ avedose : num 0 0 0 1.65 NA 1.45 NA NA 0.91 0 ...
$ tmx : num 0 1 1 0 0 0 NA NA 1 0 ...
$ num_preg : num 1 1 0 1 1 0 NA NA 1 1 ...
$ fam_hist : num 0 0 0 0 0 0 NA NA 0 0 ...
$ age_menarche : int 1 1 1 1 0 0 NA NA 0 0 ...
$ lobular : num 0 0 0 0 0 0 NA NA 0 0 ...
$ age_menopause : int 0 2 0 0 0 0 NA NA 0 0 ...
$ conf_miss : num 0 0 0 0 0 0 NA NA 0 0 ...
$ dose_num : num 0 0 0 1 NA 1 NA NA 1 0 ...
$ cmf : Factor w/ 6 levels "","\024\x9c@",..: 3 4 5 5 3 3 NA NA 3 3 ...
$ cmf_012 : num 1 2 0 0 1 1 NA NA 1 1 ...
$ setno.y : int 382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
$ cc.y : int 0 0 0 0 0 0 1 1 0 0 ...
$ chemo : int 1 1 0 0 1 1 1 1 1 1 ...
$ hormone : int 0 1 1 0 1 0 0 1 1 0 ...
$ chemo_hormone : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
$ chemo_self_mra.y : int 1 1 0 0 1 1 1 1 1 1 ...
$ hormone_self_mra.y : int 0 1 1 0 1 0 0 1 1 0 ...
$ treatment.y : int 1 1 1 0 1 1 1 1 1 1 ...
$ ID.y : int 2 6 7 9 11 12 16 22 24 26 ...
$ labid.y : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
$ status.y : int 0 0 0 0 0 0 1 1 0 0 ...
$ status2.y : int 0 0 0 0 0 0 1 1 0 0 ...
$ offset.y : num 6.41 5.82 5.61 4.03 4.77 ...
$ sub_dx_age.y : int 46 50 46 44 43 39 45 40 43 36 ...
$ XRTBreast.y : int 0 0 0 1 0 1 0 0 1 0 ...
$ Eigen_1.y : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
$ Eigen_2.y : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
$ Eigen_3.y : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
$ Eigen_4.y : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
$ Eigen_5.y : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
$ dose : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
$ dsmiss : int 0 0 0 0 1 0 0 0 0 0 ...
$ good_location.y : int 1 1 0 1 1 1 0 1 0 0 ...
$ Deleterious : int 0 0 0 0 0 0 0 0 0 0 ...
$ registry.y : Factor w/ 5 levels "IA","IR","LA",..: 5 2 2 4 4 3 2 1 1 5 ...
$ race.y : int 0 0 0 0 0 0 0 0 0 0 ...
$ age_stratum1 : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
$ dxyr_stratum : int 2 2 3 1 1 2 1 2 3 2 ...
$ CMF_Only : int 1 0 0 0 0 1 0 0 1 1 ...
$ family_history.y : int 0 0 0 0 0 0 1 0 0 0 ...
$ sub_dx_age_cat : int 0 0 0 1 1 1 0 1 1 1 ...
$ CMF : Factor w/ 3 levels "CMF","Oth","no": 1 2 3 3 2 1 2 2 1 1 ...
$ XRTBrCHAR : int 0 0 0 1 0 1 0 0 1 0 ...
demographics.df[1:5,1:5]
dim(phenotypes_update.df)
[1] 13 22
str(phenotypes_update.df)
'data.frame': 13 obs. of 22 variables:
$ gwas_id : chr "id319270" "id298378" "id271981" "id301570" ...
$ wes_id : chr "P1_C02" "P1_C04" "P1_C06" "P1_C11" ...
$ cc : int 0 0 0 0 1 0 1 1 0 0 ...
$ setno : int 227587 243637 332699 247333 204830 204830 310424 247333 398607 201558 ...
$ family_history: int 1 1 1 0 1 0 1 1 0 0 ...
$ age_dx : int 30 30 31 30 24 23 31 33 49 40 ...
$ age_ref : int 39 33 32 34 28 27 32 37 54 47 ...
$ rstime : num 9.08 3.17 2.09 4.5 3.83 3.83 1.33 4.5 5.16 7.25 ...
$ Eigen_1 : num -0.01369 -0.00753 -0.00687 -0.00363 -0.01057 ...
$ Eigen_2 : num 0.00615 0.00853 0.00138 0.00428 0.00317 ...
$ Eigen_3 : num -0.00474 -0.01256 0.00329 0.03749 -0.00716 ...
$ Eigen_4 : num -0.00596 0.00849 -0.01233 -0.02952 -0.00024 ...
$ Eigen_5 : num -0.01219 0.01074 -0.0138 -0.0202 -0.00147 ...
$ stage : int 1 1 2 2 2 1 2 1 1 1 ...
$ er1 : num 0 1 0 1 0 1 1 NA 1 NA ...
$ pr1 : num 0 1 NA 1 0 1 0 NA 1 NA ...
$ hist_cat : chr "other carcinoma" "ductal" "ductal" "ductal" ...
$ hormone : num 0 0 0 0 0 0 0 0 NA 0 ...
$ chemo_cat : chr "CMF" "no" "Oth" "no" ...
$ br_xray : int 1 0 0 0 0 1 0 1 0 1 ...
$ br_xray_dose : num 1.1 0 0 0 0 1.2 0 1.9 0 0.86 ...
$ num_preg : int 2 0 1 1 1 0 0 1 1 0 ...
phenotypes_update.df[1:5,1:5]
dim(BRCA1_BRCA2_PALB2_cases.df)
[1] 11 12
str(BRCA1_BRCA2_PALB2_cases.df)
'data.frame': 11 obs. of 12 variables:
$ Cases_wes_id : Factor w/ 11 levels "P1_A09","P1_C02",..: 1 2 3 4 5 6 7 8 9 10 ...
$ Cases_gwas_id: Factor w/ 11 levels "id204830","id247333",..: 8 9 3 1 6 2 4 11 5 7 ...
$ SYMBOL : Factor w/ 3 levels "BRCA1","BRCA2",..: 1 3 1 1 2 1 2 2 3 3 ...
$ TYPE : Factor w/ 2 levels "INDEL","SNP": 2 1 2 2 2 1 1 1 2 2 ...
$ CHROM : int 17 16 17 17 13 17 13 13 16 16 ...
$ POS.GRCh37 : int 41256985 23641422 41243800 41258504 32936732 41276044 32914437 32914437 23625324 23634452 ...
$ REF : Factor w/ 6 levels "A","C","G","GT",..: 5 6 2 1 3 1 4 4 2 2 ...
$ ALT : Factor w/ 5 levels "A","ACT","C",..: 3 5 1 3 3 2 4 4 4 4 ...
$ rs_id : Factor w/ 9 levels "rs28897672","rs28897686",..: 7 5 2 1 8 6 9 9 4 3 ...
$ Consequence : Factor w/ 6 levels "frameshift_variant",..: 2 1 6 3 3 1 1 1 5 4 ...
$ wes_var_id : Factor w/ 9 levels "Var000223294",..: 7 5 6 8 2 9 1 1 3 4 ...
$ Note : Factor w/ 2 levels "","Added for QC": NA NA 2 2 NA 2 NA NA NA NA ...
BRCA1_BRCA2_PALB2_cases.df[1:5,1:5]
dim(vv.df)
[1] 343824 35
str(vv.df)
'data.frame': 343824 obs. of 35 variables:
$ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
$ TYPE : Factor w/ 2 levels "INDEL","SNP": 1 2 2 2 2 2 2 2 2 2 ...
$ ID : Factor w/ 253381 levels "rs10000804","rs10000924",..: NA 234676 201998 253336 54527 NA NA 162750 NA NA ...
$ CHROM : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
$ POS : int 664486 762330 865628 865694 871159 871171 871173 871215 871230 871271 ...
$ REF : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1653 1044 1044 539 1044 1 539 539 539 539 ...
$ ALT : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 989 989 1 989 1 664 1 664 989 989 ...
$ QUAL : num 35191 7081 6420 991 649 ...
$ DP : int 49355 26720 14628 8357 11648 12903 13516 16282 15415 10748 ...
$ AS_VQSLOD : num 1.57 7.04 16.2 18.18 16.05 ...
$ FILTER : Factor w/ 1 level "PASS": 1 1 1 1 1 1 1 1 1 1 ...
$ AC : int 53 8 7 4 1 1 1 5 1 1 ...
$ AF : num 0.044 0.00622 0.00519 0.00318 0.00082 ...
$ AN : int 1206 1286 1348 1258 1220 1250 1258 1312 1310 1294 ...
$ NEGATIVE_TRAIN_SITE: logi NA NA NA NA NA NA ...
$ POSITIVE_TRAIN_SITE: logi NA NA NA TRUE NA NA ...
$ SYMBOL : Factor w/ 19874 levels "A1BG","A1CF",..: 14360 9127 14922 14922 14922 14922 14922 14922 14922 14922 ...
$ Allele : Factor w/ 1048 levels "-","A","AA","AAA",..: 1 790 2 790 2 533 2 533 790 790 ...
$ Existing_variation : Factor w/ 307403 levels "","1KG_2_227731951",..: NA 255726 203059 307355 55938 218187 NA 163633 277286 NA ...
$ Consequence : Factor w/ 80 levels "3_prime_UTR_variant",..: 80 32 28 28 28 28 78 78 78 28 ...
$ IMPACT : Factor w/ 4 levels "HIGH","LOW","MODERATE",..: 4 4 3 3 3 3 2 2 2 3 ...
$ CLIN_SIG : Factor w/ 93 levels "association",..: NA NA NA NA NA NA NA NA NA NA ...
$ cDNA_position : Factor w/ 16588 levels "0-1","1","1-18",..: NA 11562 6108 7438 8899 9105 9146 9809 10047 10653 ...
$ CDS_position : Factor w/ 15333 levels "1","1-18","1-2",..: NA NA 3598 5147 6675 6879 6912 7603 7853 8506 ...
$ Codons : Factor w/ 3014 levels "-/A","-/AA","-/AAAA",..: NA NA 698 472 690 319 1167 1753 1848 1532 ...
$ Protein_position : Factor w/ 7359 levels "1","1-2","1-6",..: NA NA 5937 6741 125 216 216 514 620 910 ...
$ Amino_acids : Factor w/ 1851 levels "*","*/*EX","*/C",..: NA NA 589 695 589 1586 1583 1100 1329 1107 ...
$ DISTANCE : int 960 NA NA NA NA NA NA NA NA NA ...
$ STRAND : int -1 -1 1 1 1 1 1 1 1 1 ...
$ SYMBOL_SOURCE : Factor w/ 6 levels "Clone_based_ensembl_gene",..: 2 3 3 3 3 3 3 3 3 3 ...
$ SIFT_call : Factor w/ 4 levels "deleterious",..: NA NA 2 1 3 1 NA NA NA 4 ...
$ SIFT_score : num NA NA 0.01 0.04 1 0.05 NA NA NA 0.09 ...
$ PolyPhen_call : Factor w/ 4 levels "benign","possibly_damaging",..: NA NA 1 2 1 1 NA NA NA 1 ...
$ PolyPhen_score : num NA NA 0.099 0.493 0.002 0.018 NA NA NA 0.045 ...
$ Multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
vv.df[1:5,1:5]
dim(kgen.df)
[1] 343824 9
str(kgen.df)
'data.frame': 343824 obs. of 9 variables:
$ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
$ kgen.AC : int NA NA 14 263 2 1 NA NA NA NA ...
$ kgen.AN : int NA NA 5008 5008 5008 5008 NA NA NA NA ...
$ kgen.AF : num NA NA 0.002796 0.052516 0.000399 ...
$ kgen.AFR_AF: num NA NA 0 0.0227 0 0 NA NA NA NA ...
$ kgen.AMR_AF: num NA NA 0.0072 0.0965 0 0 NA NA NA NA ...
$ kgen.EAS_AF: num NA NA 0 0.139 0 ...
$ kgen.EUR_AF: num NA NA 0.005 0.002 0.002 0.001 NA NA NA NA ...
$ kgen.SAS_AF: num NA NA 0.0041 0.0245 0 0 NA NA NA NA ...
kgen.df[1:5,1:5]
dim(exac.df)
[1] 343824 48
str(exac.df)
'data.frame': 343824 obs. of 48 variables:
$ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
$ exac_non_TCGA.AF : num NA 0.012 0.00278 0.025 0.00231 ...
$ exac_non_TCGA.AC : int NA 175 295 2670 245 NA NA NA NA NA ...
$ exac_non_TCGA.AN : int NA 14012 106020 106038 106210 NA NA NA NA NA ...
$ exac_non_TCGA.AC_FEMALE: int NA 59 94 1232 97 NA NA 1553 NA NA ...
$ exac_non_TCGA.AN_FEMALE: int NA 3452 18598 24670 45846 NA NA 45840 NA NA ...
$ exac_non_TCGA.AC_MALE : int NA 91 170 1159 148 NA NA 1483 NA NA ...
$ exac_non_TCGA.AN_MALE : int NA 8250 26380 33440 60312 NA NA 60304 NA NA ...
$ exac_non_TCGA.AC_Adj : int NA 150 264 2391 245 NA NA 3036 NA NA ...
$ exac_non_TCGA.AN_Adj : int NA 11702 44978 58110 106158 NA NA 106144 NA NA ...
$ exac_non_TCGA.AC_Hom : int NA 0 1 102 2 NA NA 136 NA NA ...
$ exac_non_TCGA.AC_Het : int NA 150 262 2187 241 NA NA 2760 NA NA ...
$ exac_non_TCGA.AC_Hemi : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_AFR : int NA 85 6 172 0 NA NA 263 NA NA ...
$ exac_non_TCGA.AN_AFR : int NA 422 4194 5062 9054 NA NA 9040 NA NA ...
$ exac_non_TCGA.Hom_AFR : int NA 0 0 2 0 NA NA 6 NA NA ...
$ exac_non_TCGA.Het_AFR : int NA 85 6 168 0 NA NA 251 NA NA ...
$ exac_non_TCGA.Hemi_AFR : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_AMR : int NA 2 20 979 0 NA NA 1180 NA NA ...
$ exac_non_TCGA.AN_AMR : int NA 224 3570 6030 11216 NA NA 11210 NA NA ...
$ exac_non_TCGA.Hom_AMR : int NA 0 0 46 0 NA NA 67 NA NA ...
$ exac_non_TCGA.Het_AMR : int NA 2 20 887 0 NA NA 1046 NA NA ...
$ exac_non_TCGA.Hemi_AMR : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_EAS : int NA 0 0 792 0 NA NA 889 NA NA ...
$ exac_non_TCGA.AN_EAS : int NA 314 3432 4930 7866 NA NA 7854 NA NA ...
$ exac_non_TCGA.Hom_EAS : int NA 0 0 47 0 NA NA 52 NA NA ...
$ exac_non_TCGA.Het_EAS : int NA 0 0 698 0 NA NA 785 NA NA ...
$ exac_non_TCGA.Hemi_EAS : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_FIN : int NA 0 9 4 149 NA NA 39 NA NA ...
$ exac_non_TCGA.AN_FIN : int NA 30 1376 1960 6614 NA NA 6614 NA NA ...
$ exac_non_TCGA.Hom_FIN : int NA 0 0 0 2 NA NA 0 NA NA ...
$ exac_non_TCGA.Het_FIN : int NA 0 9 4 145 NA NA 39 NA NA ...
$ exac_non_TCGA.Hemi_FIN : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_NFE : int NA 50 193 61 92 NA NA 172 NA NA ...
$ exac_non_TCGA.AN_NFE : int NA 2944 22156 28886 54312 NA NA 54328 NA NA ...
$ exac_non_TCGA.Hom_NFE : int NA 0 1 1 0 NA NA 2 NA NA ...
$ exac_non_TCGA.Het_NFE : int NA 50 191 59 92 NA NA 168 NA NA ...
$ exac_non_TCGA.Hemi_NFE : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_SAS : int NA 10 33 373 1 NA NA 478 NA NA ...
$ exac_non_TCGA.AN_SAS : int NA 7650 9972 10884 16402 NA NA 16404 NA NA ...
$ exac_non_TCGA.Hom_SAS : int NA 0 0 6 0 NA NA 9 NA NA ...
$ exac_non_TCGA.Het_SAS : int NA 10 33 361 1 NA NA 456 NA NA ...
$ exac_non_TCGA.Hemi_SAS : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_OTH : int NA 3 3 10 3 NA NA 15 NA NA ...
$ exac_non_TCGA.AN_OTH : int NA 118 278 358 694 NA NA 694 NA NA ...
$ exac_non_TCGA.Hom_OTH : int NA 0 0 0 0 NA NA 0 NA NA ...
$ exac_non_TCGA.Het_OTH : int NA 3 3 10 3 NA NA 15 NA NA ...
$ exac_non_TCGA.Hemi_OTH : int NA NA NA NA NA NA NA NA NA NA ...
exac.df[1:5,1:5]
# Check consistence of rownames
sum(rownames(gt.mx) != rownames(gq.mx))
[1] 0
sum(rownames(gt.mx) != rownames(dp.mx))
[1] 0
sum(rownames(gt.mx) != rownames(vv.df))
[1] 0
sum(rownames(gt.mx) != rownames(kgen.df))
[1] 0
sum(rownames(gt.mx) != rownames(exac.df))
[1] 0
# Check consistence of colnames
sum(colnames(gt.mx) != colnames(gq.mx))
[1] 0
sum(colnames(gt.mx) != colnames(dp.mx))
[1] 0
Genotypes NA rates Histogram of call rates per variant
Histograms of gq and dp in non-NA genotypes
# Fraction of NA genotypes before filtering
sum(is.na(gt.mx))/(dim(gt.mx)[1]*dim(gt.mx)[2]) # ~4%
[1] 0.0406923
# Call rates per variant before filtering
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, xlab=NULL, main="Call rates per variant before genotypes filtering")
# Histogram of gq before filtering (when gt is not NA !)
hist(gq.mx[!is.na(gt.mx)], breaks=50, main="Histogram of gq in non-NA genotypes (before filtering)", xlab=NULL)
# Histogram of dp before filtering (when gt is not NA !)
hist(dp.mx[!is.na(gt.mx)], breaks=50, main="Histogram of dp in non-NA genotypes (before filtering)", xlab=NULL)
hist(dp.mx[!is.na(gt.mx)], breaks=2500, xlim=c(0,100), main="Histogram of dp in non-NA genotypes (before filtering, 0:100)", xlab=NULL)
rm(x,y)
Put NA to genotypes where gq < 20 : removes ~8% of non-NA genotypes
# num of genotypes to be removed
sum(gq.mx < min.gq, na.rm=TRUE)
[1] 19243468
# Fraction of genotypes to be removed
sum(gq.mx < min.gq, na.rm=TRUE)/sum(!is.na(gq.mx)) # ~8%
[1] 0.08235379
# Apply filter (to gt only !)
NA -> gt.mx[ gq.mx < min.gq ]
# Clean up
rm(min.gq)
dim() genotypes NA rates Histogram of call rates per variant Histograms of gq and dp in non-NA genotypes
dim(gt.mx)
[1] 343824 710
# Fraction of NA genotypes after filtering
sum(is.na(gt.mx))/(dim(gt.mx)[1]*dim(gt.mx)[2]) # ~12%
[1] 0.1195218
# Call rates per variant after gq filtering
x <- ncol(gt.mx)
y <- apply(gt.mx, 1, function(z){1-sum(is.na(z))/x})
hist(y, xlab=NULL, main="Histogram of call rates per variant after gt filtering")
# Histogram of gq after gq filtering (when gt is not NA !)
hist(gq.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=50, main="Histogram of gq in non NA genotypes (after gq filtering)", xlab=NULL)
View(vv.df)
# Histogram of dp after gt filtering (when gt is not NA !)
hist(dp.mx[!is.na(gt.mx)], breaks=50, main="Histogram of dp in non-NA genotypes (after gt filtering)", xlab=NULL)
hist(dp.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=2500, main="Histogram of dp in non-NA genotypes (after gt filtering, 0:100)", xlab=NULL)
# Clean up
rm(x, y)
put NA to genotypes where dp > 500 : removes <<1% of non-NA genotypes
# num of genotypes to be removed
sum(dp.mx > max.dp, na.rm=TRUE)
[1] 45313
# Fraction of genotypes to be removed (appr)
sum(dp.mx > max.dp, na.rm=TRUE)/sum(!is.na(gq.mx)) # <<1%
[1] 0.0001939202
# Apply filter (to gt only !)
NA -> gt.mx[ dp.mx > max.dp ]
# Clean up
rm(max.dp)
dim() Genotypes NA rates Histogram of call rates per variant Histograms of gq and dp
dim(gt.mx)
[1] 343824 710
# Fraction of NA genotypes after filtering
sum(is.na(gt.mx))/(dim(gt.mx)[1]*dim(gt.mx)[2]) # ~12%
[1] 0.1196978
# Call rates per variant after gq filtering
x <- ncol(gt.mx)
y <- apply(gt.mx, 1, function(z){1-sum(is.na(z))/x})
hist(y, xlab=NULL, main="Call rates per variant after gt+dp filtering")
# Histogram of gq after gq+dp filtering (when gt is not NA !)
hist(gq.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=50, main="Histogram of gq in non-NA genotypes (after gq+dp filtering)", xlab=NULL)
# Histogram of dp after gt filtering (when gt is not NA !)
hist(dp.mx[!is.na(gt.mx)], breaks=50, main="Histogram of dp after gt+dp filtering", xlab=NULL)
hist(dp.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=500, main="Histogram of dp in non-NA genotypes (after gt+dp filtering, 0:100)", xlab=NULL)
# Clean up
rm(x, y)
Remove variants with call rate < 80% after the gemnotypes filtering: removes ~18% of variants (343,824 -> 283,651)
# Estimate the proportion of variants to be removed
x <- ncol(gt.mx)
y <- apply(gt.mx, 1, function(z){1-sum(is.na(z))/x})
y[1:7]
Var000000001 Var000000002 Var000000003 Var000000004 Var000000005 Var000000006 Var000000007
0.7211268 0.8112676 0.7887324 0.5647887 0.6760563 0.7098592 0.7450704
var.retained <- y >= min.call.rate
sum(var.retained) # 283.651
[1] 283651
1 - sum(var.retained)/nrow(gt.mx) # ~18%
[1] 0.1750111
# Remove variants with loaw call rates
gt.mx <- gt.mx[ var.retained, ]
dp.mx <- dp.mx[ var.retained, ]
gq.mx <- gq.mx[ var.retained, ]
vv.df <- vv.df[ var.retained, ]
kgen.df <- kgen.df[ var.retained, ]
exac.df <- exac.df[ var.retained, ]
# Clean-up
rm(min.call.rate, var.retained, x, y)
Histograms of gq and dp
NA rates in gq table
Histogram of call rates per variant
dim(gt.mx)
[1] 283651 710
# Fraction of NA genotypes after filtering
sum(is.na(gt.mx))/(dim(gt.mx)[1]*dim(gt.mx)[2]) # ~8%
[1] 0.07691938
# Call rates per variant after filtering
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, xlim=c(0,1), breaks=10, xlab=NULL, main="Call rates per variant after gq+dp+cr genotypes filtering")
# Histogram of gq after filtering (when gt is not NA !)
hist(gq.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=50, main="Histogram of gq in non-NA genotypes (after gq+dp+cr filtering)", xlab=NULL)
# Histogram of dp after filtering (when gt is not NA !)
hist(dp.mx[!is.na(gt.mx)], breaks=50, main="Histogram of dp in non-NA genotypes (after gq+dp+cr filtering)", xlab=NULL)
hist(dp.mx[!is.na(gt.mx)], breaks=500, xlim=c(0,100), main="Histogram of dp in non-NA genotypes (after gq+dp+cr filtering, 0:100)", xlab=NULL)
# Clean-up
rm(x, y)
mean(gq.mx, na.rm=TRUE)
[1] 84.51277
mean(dp.mx, na.rm=TRUE)
[1] 37.31567
rm(gq.mx, dp.mx)
dim(gt.mx)
[1] 283651 710
class(gt.mx)
[1] "matrix"
gt.mx[1:5,1:5]
HG00097 HG00099 HG00100 HG00102 HG00106
Var000000002 0 0 NA NA 0
Var000000008 0 NA 0 0 0
Var000000009 0 0 0 0 0
Var000000012 0 0 NA 0 0
Var000000013 0 0 NA 0 0
dim(gq.mx)
Error: object 'gq.mx' not found
rr
ls() sessionInfo() Sys.time()